# Provides a session log for the packages used in this .rmd
# Write versions of all non-attached packages to a text file so the exact
# environment used for this analysis can be reconstructed later.
library(sessioninfo)
session_info(pkgs = "!attached", to_file = "03a_session_log.txt")
In this script, we go through all the pre-registered proposed analyses. As a reminder, the main research questions were as follows:
Here we present the main “sample theory based” analyses (also known as frequentist), separately on the North American and UK samples in parallel to answer our first two research questions, then together to answer our third research question. In the next section (03b) we provide additional Bayesian statistics where a null effect was found, as specified in the pre-registration.
TODO: Delete any libraries that are solely for the Bayes analysis and can be deleted here.
# Library imports, general settings ==============
library(tidyverse)
library(egg)
library(knitr)
library(lme4)
library(lmerTest)
library(simr)
# As in our discussion with Mike, we will use lmerTest for calculating p values
# in the mixed-effects models (noted as a deviation)
library(brms)
library(rstan)
library(future)
# NOTE: `multiprocess` is deprecated in the future package; use
# `multisession` instead if running from RStudio (where `multicore`
# is not supported).
plan(multicore, workers = 8)
library(lattice)
library(effects)
library(sjPlot)
library(robustlmm)
library(car)
# Load model comparison functions
source("helper/lrtests.R")
# Deal with package priority issues
select <- dplyr::select
theme_set(theme_bw(base_size = 10))
# Use TRUE, never T: T is an ordinary variable that can be reassigned.
options("future" = TRUE)
# knitr::opts_chunk$set(cache = TRUE)
print(sessionInfo()) # listing all info about R and packages info
## R version 4.2.1 (2022-06-23)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Catalina 10.15.7
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.2/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.2/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] furrr_0.3.0 future.apply_1.9.0 bridgesampling_1.1-2
## [4] car_3.1-0 robustlmm_3.0-1 sjPlot_2.8.10
## [7] effects_4.2-1 carData_3.0-5 lattice_0.20-45
## [10] future_1.27.0 rstan_2.21.5 StanHeaders_2.21.0-7
## [13] brms_2.17.0 Rcpp_1.0.9 simr_1.0.6
## [16] lmerTest_3.1-3 lme4_1.1-30 Matrix_1.4-1
## [19] knitr_1.39 egg_0.4.5 gridExtra_2.3
## [22] forcats_0.5.1 stringr_1.4.0 dplyr_1.0.9
## [25] purrr_0.3.4 readr_2.1.2 tidyr_1.2.0
## [28] tibble_3.1.8 ggplot2_3.3.6 tidyverse_1.3.2
## [31] sessioninfo_1.2.2
##
## loaded via a namespace (and not attached):
## [1] readxl_1.4.0 backports_1.4.1 plyr_1.8.7
## [4] igraph_1.3.4 splines_4.2.1 RLRsim_3.1-8
## [7] listenv_0.8.0 crosstalk_1.2.0 rstantools_2.2.0
## [10] inline_0.3.19 digest_0.6.29 htmltools_0.5.3
## [13] fansi_1.0.3 magrittr_2.0.3 checkmate_2.1.0
## [16] googlesheets4_1.0.0 tzdb_0.3.0 globals_0.15.1
## [19] modelr_0.1.8 RcppParallel_5.1.5 matrixStats_0.62.0
## [22] xts_0.12.1 prettyunits_1.1.1 colorspace_2.0-3
## [25] rvest_1.0.2 fastGHQuad_1.0.1 mitools_2.4
## [28] haven_2.5.0 xfun_0.31 callr_3.7.1
## [31] crayon_1.5.1 jsonlite_1.8.0 survival_3.3-1
## [34] zoo_1.8-10 iterators_1.0.14 glue_1.6.2
## [37] gtable_0.3.0 gargle_1.2.0 emmeans_1.7.5
## [40] sjstats_0.18.1 sjmisc_2.8.9 distributional_0.3.0
## [43] pkgbuild_1.3.1 DEoptimR_1.0-11 abind_1.4-5
## [46] scales_1.2.0 mvtnorm_1.1-3 DBI_1.1.3
## [49] ggeffects_1.1.2 miniUI_0.1.1.1 plotrix_3.8-2
## [52] performance_0.9.1 xtable_1.8-4 survey_4.1-1
## [55] stats4_4.2.1 DT_0.23 datawizard_0.4.1
## [58] htmlwidgets_1.5.4 httr_1.4.3 threejs_0.3.3
## [61] posterior_1.2.2 ellipsis_0.3.2 pkgconfig_2.0.3
## [64] loo_2.5.1 farver_2.1.1 nnet_7.3-17
## [67] sass_0.4.2 dbplyr_2.2.1 binom_1.1-1.1
## [70] utf8_1.2.2 effectsize_0.7.0 tidyselect_1.1.2
## [73] rlang_1.0.4 reshape2_1.4.4 later_1.3.0
## [76] munsell_0.5.0 cellranger_1.1.0 tools_4.2.1
## [79] cachem_1.0.6 cli_3.3.0 generics_0.1.3
## [82] sjlabelled_1.2.0 broom_1.0.0 ggridges_0.5.3
## [85] evaluate_0.15 fastmap_1.1.0 yaml_2.3.5
## [88] processx_3.7.0 fs_1.5.2 robustbase_0.95-0
## [91] nlme_3.1-158 mime_0.12 xml2_1.3.3
## [94] compiler_4.2.1 pbkrtest_0.5.1 bayesplot_1.9.0
## [97] shinythemes_1.2.0 rstudioapi_0.13 reprex_2.0.1
## [100] bslib_0.4.0 stringi_1.7.8 parameters_0.18.1
## [103] ps_1.7.1 Brobdingnag_1.2-7 nloptr_2.0.3
## [106] markdown_1.1 shinyjs_2.1.0 tensorA_0.36.2
## [109] vctrs_0.4.1 pillar_1.8.0 lifecycle_1.0.1
## [112] jquerylib_0.1.4 estimability_1.4 insight_0.18.0
## [115] httpuv_1.6.5 R6_2.5.1 promises_1.2.0.1
## [118] parallelly_1.32.1 codetools_0.2-18 boot_1.3-28
## [121] colourpicker_1.1.1 MASS_7.3-57 gtools_3.9.3
## [124] assertthat_0.2.1 withr_2.5.0 shinystan_2.6.0
## [127] bayestestR_0.12.1 mgcv_1.8-40 parallel_4.2.1
## [130] hms_1.1.1 grid_4.2.1 coda_0.19-4
## [133] minqa_1.2.4 rmarkdown_2.14 googledrive_2.0.0
## [136] numDeriv_2016.8-1.1 shiny_1.7.1 lubridate_1.8.0
## [139] base64enc_0.1-3 dygraphs_1.1.1.6
# Read data ======================================
# Explicit column types for the processed dataset: factors for IDs and
# design variables, integers/numerics for ages and scores, logicals for
# flags. Keeping the spec explicit avoids silent type-guessing drift.
col_types <- cols(
labid = col_factor(),
subid = col_factor(),
subid_unique = col_factor(),
CDI.form = col_factor(),
CDI.nwords = col_integer(),
CDI.prop = col_number(),
CDI.agerange = col_factor(),
CDI.agedays = col_integer(),
CDI.agemin = col_integer(),
CDI.agemax = col_integer(),
vocab_nwords = col_integer(),
standardized.score.CDI = col_character(),
standardized.score.CDI.num = col_number(),
IDS_pref = col_number(),
language = col_factor(),
language_zone = col_factor(),
CDI.error = col_logical(),
Notes = col_character(),
trial_order = col_factor(),
method = col_factor(),
age_days = col_integer(),
age_mo = col_number(),
age_group = col_factor(),
nae = col_logical(),
gender = col_factor(),
second_session = col_logical()
)
data.total <- read_csv("data/02b_processed.csv", col_types = col_types)
Before moving on with the analysis, we have to ready the data by (a)
checking for colinearity between z_age_months and
CDI.z_age_months and correcting this if necessary, and (b)
setting up the contrasts described in our data analysis.
First, we run a Kappa test on the possibility of colinearity between
z_age_months and CDI.z_age_months.
# Run kappa test
# Condition number of the design matrix for the two age predictors;
# kappa > 10 would indicate problematic collinearity (checked in prose below).
# Fix: TRUE instead of T (T is reassignable and unsafe).
k.age_months <- model.matrix(~ z_age_months + CDI.z_age_months, data = data.total) %>%
  kappa(exact = TRUE)
With a value of 3.1953332, we do not have a collinearity issue and can proceed with the data analysis as planned (the criterion indicating collinearity is kappa > 10).
We need gender as an effect-coded factor, and
method as a deviation-coded factor. This is achieved in R
by using the contr.sum() function with the number of levels
for each factor. Notably, when subsetting the UK sample, only two levels
of method out of the three in total were left.
# Set contrasts on the total dataset =============
# Helper: apply sum-to-zero (effect/deviation) coding to gender and method.
# contr.sum() is sized from the number of levels actually present, so the
# same helper works for the full dataset (3 methods) and for the UK subsets
# (only 2 methods remain after droplevels()). This replaces eight
# near-identical pairs of contrasts() assignments.
set_sum_contrasts <- function(df) {
  contrasts(df$gender) <- contr.sum(nlevels(df$gender))
  contrasts(df$method) <- contr.sum(nlevels(df$method))
  df
}
data.total <- set_sum_contrasts(data.total)
# Create sub-datasets, with contrasts ============
# droplevels() strips unused factor levels (and their contrasts), so the
# contrasts must be re-applied to every subset.
## NAE
data.nae <- data.total %>%
  subset(language_zone == "NAE") %>%
  droplevels() %>%
  set_sum_contrasts()
## UK (combined-age and separate 18/24 months)
# Note: the UK sample has only 2 method levels, so contr.sum() is sized to 2.
data.uk <- data.total %>%
  subset(language_zone == "British") %>%
  droplevels() %>%
  set_sum_contrasts()
data.uk.18 <- data.total %>%
  subset(language_zone == "British" & CDI.agerange == "18") %>%
  droplevels() %>%
  set_sum_contrasts()
data.uk.24 <- data.total %>%
  subset(language_zone == "British" & CDI.agerange == "24") %>%
  droplevels() %>%
  set_sum_contrasts()
## Other
data.other <- data.total %>%
  subset(language_zone == "Other") %>%
  droplevels() %>%
  set_sum_contrasts()
We first assess the amount of data we have overall per condition and their shape overall.
# Cell counts and age descriptives (mean/sd of CDI.agedays) per
# zone x age group x method x gender, rendered as a markdown table.
data.total %>%
group_by(language_zone, CDI.agerange, method, gender) %>%
summarise(N = n(), age = mean(CDI.agedays), sd = sd(CDI.agedays)) %>%
kable()
## `summarise()` has grouped output by 'language_zone', 'CDI.agerange', 'method'.
## You can override using the `.groups` argument.
| language_zone | CDI.agerange | method | gender | N | age | sd |
|---|---|---|---|---|---|---|
| British | 18 | singlescreen | F | 24 | 551.2083 | 10.741950 |
| British | 18 | singlescreen | M | 24 | 550.5000 | 13.497182 |
| British | 18 | eyetracking | F | 9 | 548.5556 | 9.139353 |
| British | 18 | eyetracking | M | 11 | 547.8182 | 10.147100 |
| British | 24 | singlescreen | F | 18 | 741.6111 | 13.128948 |
| British | 24 | singlescreen | M | 15 | 745.0667 | 15.549307 |
| British | 24 | eyetracking | F | 13 | 731.5385 | 12.162890 |
| British | 24 | eyetracking | M | 5 | 737.8000 | 8.228001 |
| Other | 18 | singlescreen | F | 11 | 541.5455 | 7.160498 |
| Other | 18 | singlescreen | M | 13 | 544.3077 | 6.663140 |
| Other | 18 | eyetracking | F | 26 | 556.0769 | 14.133430 |
| Other | 18 | eyetracking | M | 30 | 560.8333 | 16.128970 |
| Other | 18 | hpp | F | 38 | 549.0526 | 10.812774 |
| Other | 18 | hpp | M | 38 | 555.1579 | 13.664961 |
| Other | 24 | singlescreen | F | 10 | 731.3000 | 14.802777 |
| Other | 24 | singlescreen | M | 12 | 721.1667 | 13.888081 |
| Other | 24 | eyetracking | F | 28 | 730.1786 | 9.996494 |
| Other | 24 | eyetracking | M | 25 | 730.1600 | 7.711896 |
| Other | 24 | hpp | F | 45 | 730.7333 | 17.357733 |
| Other | 24 | hpp | M | 35 | 730.5143 | 15.849237 |
| NAE | 18 | singlescreen | F | 19 | 554.9474 | 20.780530 |
| NAE | 18 | singlescreen | M | 14 | 581.2143 | 24.925052 |
| NAE | 18 | eyetracking | F | 1 | 549.0000 | NA |
| NAE | 18 | hpp | F | 56 | 560.9821 | 21.584920 |
| NAE | 18 | hpp | M | 57 | 559.6140 | 21.163272 |
| NAE | 24 | singlescreen | F | 23 | 737.1739 | 26.604258 |
| NAE | 24 | singlescreen | M | 20 | 739.6000 | 21.352678 |
| NAE | 24 | eyetracking | F | 2 | 756.5000 | 34.648232 |
| NAE | 24 | eyetracking | M | 1 | 745.0000 | NA |
| NAE | 24 | hpp | F | 54 | 733.9630 | 27.476895 |
| NAE | 24 | hpp | M | 60 | 727.9500 | 27.409899 |
More detailed information about Descriptive Statistics
# Number of contributing labs per language zone
data.total %>%
  distinct(labid, language_zone) %>%
  group_by(language_zone) %>%
  count()
## # A tibble: 3 × 2
## # Groups: language_zone [3]
## language_zone n
## <fct> <int>
## 1 British 4
## 2 Other 8
## 3 NAE 8
# Sample size per language zone and age group
data.total %>%
group_by(language_zone, CDI.agerange) %>%
summarize(N = n())
## `summarise()` has grouped output by 'language_zone'. You can override using the
## `.groups` argument.
## # A tibble: 6 × 3
## # Groups: language_zone [3]
## language_zone CDI.agerange N
## <fct> <fct> <int>
## 1 British 18 68
## 2 British 24 51
## 3 Other 18 156
## 4 Other 24 155
## 5 NAE 18 147
## 6 NAE 24 160
# age range in each age group and language zone
# (CDI.agedays is in days; convert to months using the mean year length)
data.total %>%
  distinct(subid, language_zone, CDI.agedays, CDI.agerange) %>%
  group_by(language_zone, CDI.agerange) %>%
  summarize(
    age_min = (min(CDI.agedays) / 365.25 * 12),
    age_max = (max(CDI.agedays) / 365.25 * 12)
  )
## `summarise()` has grouped output by 'language_zone'. You can override using the
## `.groups` argument.
## # A tibble: 6 × 4
## # Groups: language_zone [3]
## language_zone CDI.agerange age_min age_max
## <fct> <fct> <dbl> <dbl>
## 1 British 18 17.4 19.2
## 2 British 24 23.4 25.1
## 3 Other 18 17.4 19.5
## 4 Other 24 22.5 25.2
## 5 NAE 18 16.6 20.0
## 6 NAE 24 22.2 25.9
We then assess the data per lab in terms of sample size and CDI score (vocabulary size, for consistency between language zones).
# Per-lab sample size and mean vocabulary size with 95% CIs.
# The CI comes from an intercept-only lm() fit within each lab cell.
by_lab <- data.total %>%
group_by(labid, language_zone, CDI.agerange) %>%
mutate(tested = n_distinct(subid_unique)) %>%
select(labid, language_zone, CDI.agerange, tested, vocab_nwords) %>%
nest(scores = vocab_nwords) %>%
mutate(
model = map(scores, ~ lm(vocab_nwords ~ 1, data = .x)),
ci = map(model, confint)
) %>%
transmute(
tested = tested,
mean = map_dbl(model, ~ coefficients(.x)[[1]]),
ci_lower = map_dbl(ci, 1),
ci_upper = map_dbl(ci, 2)
) %>%
arrange(language_zone) %>%
rownames_to_column() %>%
ungroup()
# created a new column with the labs IDs as character so it can be sorted in alphabetical order
by_lab$labid_car <- as.character(by_lab$labid)
# relabel the CDI age factor column
# NOTE(review): assumes the existing level order is ("18", "24") - confirm.
levels(by_lab$CDI.agerange) <- c("18 months", "24 months")
# Making sure the factor columns have levels in the order I would like to graph them
by_lab$language_zone <- factor(by_lab$language_zone, levels = c("British", "NAE", "Other"))
# sorted the idcolum in the way I would it to show in the ggplot
labid_ord <- by_lab %>%
dplyr::arrange(language_zone, desc(labid_car)) %>%
ungroup() %>%
filter(CDI.agerange == "18 months") %>%
select(labid)
# Making sure the factor columns have levels in the order I would like to graph them
by_lab$labid <- factor(by_lab$labid, levels = labid_ord$labid)
## graph by language zone and lab Id in asc order
# Forest-style plot: one row per lab, point size = N tested, faceted by age group.
by_lab %>%
ggplot(
.,
aes(
x = labid,
y = mean, colour = language_zone,
ymin = ci_lower, ymax = ci_upper
)
) +
geom_linerange() +
geom_point(aes(size = tested)) +
coord_flip(ylim = c(0, 500)) +
xlab("Lab") +
ylab("Vocabulary size") +
scale_colour_brewer(
palette = "Dark2", name = "Language zone",
breaks = c("British", "NAE", "Other")
) +
scale_size_continuous(name = "N", breaks = function(x) c(min(x), mean(x), max(x))) +
facet_wrap(vars(CDI.agerange)) +
theme(
legend.position = "bottom",
strip.background = element_blank(),
strip.placement = "outside",
text = element_text(size = 22)
)
ggsave("plots/vocab-size_by-lab.png", width = 12, height = 8)
First, we want to assess quickly if there is a direct correlation between IDS preference and CDI score, computing a Pearson’s product-moment correlation. We use standardized CDI scores for the North American sample, and raw scores for the British sample. Since CDI grows with age, we run the British sample separately for 18 and 24 months.
# Statistics =====================================
## North American Sample
# Pearson correlation between IDS preference and the standardized CDI score
# (standardized scores are available for the NAE sample only).
test.pearson.nae <- cor.test(data.nae$IDS_pref,
data.nae$z_standardized_CDI,
alternative = "two.sided", method = "pearson"
)
test.pearson.nae
##
## Pearson's product-moment correlation
##
## data: data.nae$IDS_pref and data.nae$z_standardized_CDI
## t = -0.91847, df = 305, p-value = 0.3591
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.16349833 0.05977293
## sample estimates:
## cor
## -0.05251901
## UK Sample
# Pearson correlations between IDS preference and raw vocabulary size,
# run separately per age group because vocabulary grows with age.
test.pearson.uk.18 <- cor.test(data.uk.18$IDS_pref,
data.uk.18$z_vocab_nwords,
alternative = "two.sided", method = "pearson"
)
test.pearson.uk.18
##
## Pearson's product-moment correlation
##
## data: data.uk.18$IDS_pref and data.uk.18$z_vocab_nwords
## t = 0.70855, df = 66, p-value = 0.4811
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.1547438 0.3187097
## sample estimates:
## cor
## 0.08688698
test.pearson.uk.24 <- cor.test(data.uk.24$IDS_pref,
data.uk.24$z_vocab_nwords,
alternative = "two.sided", method = "pearson"
)
test.pearson.uk.24
##
## Pearson's product-moment correlation
##
## data: data.uk.24$IDS_pref and data.uk.24$z_vocab_nwords
## t = 0.36606, df = 49, p-value = 0.7159
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.2266228 0.3231553
## sample estimates:
## cor
## 0.05222234
Plots for correlation
## North American Sample
### Get correlation value for annotation
# Expression string parsed by ggplot to render an italic R^2 label.
# NOTE(review): cor_value is Pearson's r (the estimate), but the label
# renders it as R^2 - confirm which statistic is intended.
cor_text <- "paste(italic(R)^2, \" =\")"
cor_value <- round(test.pearson.nae$estimate, 3)
### Build plot
# Fix: TRUE instead of T in annotate() (T is reassignable).
plot.pearson.nae <- data.nae %>%
  ggplot(aes(
    x = IDS_pref,
    y = standardized.score.CDI.num
  )) +
  xlab("IDS preference") +
  ylab("Standardized CDI score") +
  geom_point() +
  geom_smooth(method = lm) +
  annotate("text",
    x = -.9, y = 51, parse = TRUE, size = 4,
    label = paste(cor_text, cor_value, sep = "~")
  )
## UK Sample
# The two UK age-group plots were copy-pasted duplicates; build both with
# one helper. Also fixes parse = T -> TRUE (T is reassignable).
make_uk_pearson_plot <- function(df, pearson_test) {
  # df: one UK age-group subset; pearson_test: its cor.test() result,
  # whose rounded estimate is shown in the annotation.
  cor_value <- round(pearson_test$estimate, 3)
  df %>%
    ggplot(aes(
      x = IDS_pref,
      y = vocab_nwords
    )) +
    xlab("IDS preference") +
    ylab("Vocabulary size (in words)") +
    geom_point() +
    geom_smooth(method = lm) +
    annotate("text",
      x = .8, y = 150, parse = TRUE, size = 4,
      label = paste(cor_text, cor_value, sep = "~")
    )
}
plot.pearson.uk.18 <- make_uk_pearson_plot(data.uk.18, test.pearson.uk.18)
plot.pearson.uk.24 <- make_uk_pearson_plot(data.uk.24, test.pearson.uk.24)
# Global plot
# Arrange the three correlation panels (NAE, UK-18, UK-24) side by side.
plot.pearson <- ggarrange(plot.pearson.nae, plot.pearson.uk.18, plot.pearson.uk.24, ncol = 3)
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
plot.pearson
# TODO: These plots need to be cleaned up!
ggsave("plots/corr_plot.pdf", plot.pearson,
units = "mm", width = 180, height = 100, dpi = 1000
)
We see no obvious direct link between IDS preference and CDI score here. However, an effect might appear once we take into account various factors that might interact with IDS preference and/or CDI score. We can also first enhance these plots with information about the age group at which infants were tested (18- or 24-month-old) for the NAE sample, using vocabulary size to better compare the NAE and UK samples.
# Scatter of vocabulary size vs IDS preference for the UK and NAE samples
# only, coloured by age group; raw vocabulary size is used so the two
# zones are directly comparable.
plot.age_group <- data.total %>%
  subset(language_zone != "Other") %>%
  droplevels() %>%
  ggplot(aes(
    x = IDS_pref,
    y = vocab_nwords,
    colour = CDI.agerange
  )) +
  facet_wrap(vars(language_zone),
    labeller = as_labeller(c(
      "British" = "UK samples",
      "NAE" = "North American samples" # typo fix: was "Amercian"
    ))
  ) +
  xlab("Standardized IDS preference score") + # typo fix: was "prefence"
  ylab("Vocabulary size (in words)") +
  theme(legend.position = "top") +
  geom_point() +
  geom_smooth(method = lm) +
  scale_colour_brewer(
    palette = "Dark2", name = "Age group",
    breaks = c("18", "24"),
    labels = c("18mo", "24mo") # consistency fix: was "24m" next to "18mo"
  )
ggsave("plots/scatter_age.pdf", plot.age_group,
  units = "mm", width = 180, height = 100, dpi = 1000
)
## `geom_smooth()` using formula 'y ~ x'
(plot.age_group)
## `geom_smooth()` using formula 'y ~ x'
Here, we run a mixed-effects model including only theoretically motivated effects, as described in the pre-registration. We start with the full model below, simplifying the random effects structure until it converges.
# Run models =====================================
## NAE
# Mean-center IDS preference so its interaction terms are interpretable.
# Fix: TRUE/FALSE instead of T/F (both are reassignable variables).
data.nae$centered_IDS_pref <- scale(data.nae$IDS_pref, center = TRUE, scale = FALSE)
# Full pre-registered model: CDI age, gender, test age, method, centered IDS
# preference and its interactions, with random intercepts for lab and for
# participant (subid_unique; some infants were tested twice).
lmer.full.nae <- lmer(standardized.score.CDI.num ~ CDI.z_age_months + gender +
  z_age_months + method + centered_IDS_pref +
  centered_IDS_pref:method + centered_IDS_pref:CDI.z_age_months + centered_IDS_pref:z_age_months +
  (1 | labid) + (1 | subid_unique),
data = data.nae
)
summary(lmer.full.nae)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula:
## standardized.score.CDI.num ~ CDI.z_age_months + gender + z_age_months +
## method + centered_IDS_pref + centered_IDS_pref:method + centered_IDS_pref:CDI.z_age_months +
## centered_IDS_pref:z_age_months + (1 | labid) + (1 | subid_unique)
## Data: data.nae
##
## REML criterion at convergence: 2806.1
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -2.2240 -0.4550 -0.0619 0.4779 2.5238
##
## Random effects:
## Groups Name Variance Std.Dev.
## subid_unique (Intercept) 531.62 23.06
## labid (Intercept) 37.34 6.11
## Residual 246.81 15.71
## Number of obs: 307, groups: subid_unique, 211; labid, 8
##
## Fixed effects:
## Estimate Std. Error df t value
## (Intercept) 32.68863 6.42717 48.76105 5.086
## CDI.z_age_months 0.04467 0.35954 121.35781 0.124
## gender1 -1.51119 1.88476 193.54899 -0.802
## z_age_months -0.02603 0.62952 69.26913 -0.041
## method1 6.45062 6.51547 121.14260 0.990
## method2 -13.61370 11.77122 176.84942 -1.157
## centered_IDS_pref -33.98215 24.82232 210.07506 -1.369
## method1:centered_IDS_pref 28.35879 25.46036 210.29032 1.114
## method2:centered_IDS_pref -59.58231 49.55101 209.34368 -1.202
## CDI.z_age_months:centered_IDS_pref 0.41746 1.10252 123.90420 0.379
## z_age_months:centered_IDS_pref 2.01470 2.07816 189.23347 0.969
## Pr(>|t|)
## (Intercept) 5.82e-06 ***
## CDI.z_age_months 0.901
## gender1 0.424
## z_age_months 0.967
## method1 0.324
## method2 0.249
## centered_IDS_pref 0.172
## method1:centered_IDS_pref 0.267
## method2:centered_IDS_pref 0.231
## CDI.z_age_months:centered_IDS_pref 0.706
## z_age_months:centered_IDS_pref 0.334
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) CDI.z__ gendr1 z_g_mn methd1 methd2 c_IDS_ m1:_ID m2:_ID
## CDI.z_g_mnt -0.056
## gender1 -0.033 0.022
## z_age_mnths 0.051 -0.037 -0.023
## method1 -0.711 0.001 0.022 0.106
## method2 0.878 -0.031 -0.037 0.017 -0.875
## cntrd_IDS_p 0.330 0.007 0.039 -0.040 -0.323 0.359
## mthd1:_IDS_ -0.310 -0.029 -0.027 0.019 0.335 -0.357 -0.927
## mthd2:_IDS_ 0.324 0.022 0.060 -0.010 -0.336 0.363 0.966 -0.971
## CDI.__:_IDS -0.010 -0.006 -0.019 -0.018 -0.030 0.010 -0.047 0.027 -0.047
## z_g_m:_IDS_ -0.041 0.028 0.103 0.027 -0.035 0.019 0.035 -0.086 0.132
## CDI.__:
## CDI.z_g_mnt
## gender1
## z_age_mnths
## method1
## method2
## cntrd_IDS_p
## mthd1:_IDS_
## mthd2:_IDS_
## CDI.__:_IDS
## z_g_m:_IDS_ -0.071
# Robust variant kept for reference only: it was used to probe sensitivity
# to assumption violations; inferential results matched the original model.
# robust_lmer.full.nae <- robustlmm::rlmer(standardized.score.CDI.num ~ CDI.z_age_months + gender +
# z_age_months + method + centered_IDS_pref +
# centered_IDS_pref:method + centered_IDS_pref:CDI.z_age_months + centered_IDS_pref:z_age_months +
# (1 | labid),
# data = data.nae)
#
#
# summary(robust_lmer.full.nae) #this model is used to see if we can meet some statistical assumption better but we decided to use the original model as the inferential statistical results are consistent
# Type III ANOVA table (via lmerTest) for the full NAE model.
full.nae_pvalue <- anova(lmer.full.nae) %>%
as_tibble(rownames = "Parameter") # this gives us the Type III p values
# ==========
# Sequentially removed random effects:
# IDS_pref:z_age_months
# IDS_pref:CDI.z_age_months
# IDS_pref:method
# IDS_pref
# method
# z_age_months
# gender
# CDI.z_age_months
# ==========
# First, check linearity
# data.nae$resid <- residuals(lmer.full.nae)
#
# plot(data.nae$resid, data.nae$standardized.score.CDI)
# Second, check normality
# sjPlot diagnostic panel: QQ plots of residuals and random effects, etc.
plot_model(lmer.full.nae, type = "diag") ## we do have right-skewed normality of residuals
## [[1]]
## `geom_smooth()` using formula 'y ~ x'
##
## [[2]]
## [[2]]$subid_unique
## `geom_smooth()` using formula 'y ~ x'
##
## [[2]]$labid
## `geom_smooth()` using formula 'y ~ x'
##
##
## [[3]]
##
## [[4]]
## `geom_smooth()` using formula 'y ~ x'
# Third, check autocorrelation
# Refit with nlme::lme (same fixed effects, lab random intercept only)
# because nlme::ACF requires an lme object, not an lmer one.
re_run_lme.full.nae <- nlme::lme(standardized.score.CDI.num ~ CDI.z_age_months + gender +
z_age_months + method + centered_IDS_pref +
centered_IDS_pref:method + centered_IDS_pref:CDI.z_age_months + centered_IDS_pref:z_age_months,
random = ~ 1 | labid,
method = "REML",
data = data.nae, na.action = na.exclude
)
plot(nlme::ACF(re_run_lme.full.nae, resType = "normalized")) # there is no sign for autocorrelation
# Lastly, check multi-collinearity
car::vif(lmer.full.nae) # we do see a multicollineartiy for the IDS preference variable, even though we have centered the IDS preference score. It is probably related to the number the participating labs (as this is the group level that we are controlling) and how we entered interaction between IDS preference and other variables (that lack variability in the current sample). We need to keep IDS preference in the model as exploring the relationship between IDS preference and CDI score is the key research question in the paper.
## GVIF Df GVIF^(1/(2*Df))
## CDI.z_age_months 1.009197 1 1.004588
## gender 1.039338 1 1.019479
## z_age_months 1.095177 1 1.046507
## method 1.258318 2 1.059126
## centered_IDS_pref 19.302592 1 4.393472
## method:centered_IDS_pref 20.839041 2 2.136581
## CDI.z_age_months:centered_IDS_pref 1.014438 1 1.007193
## z_age_months:centered_IDS_pref 1.264833 1 1.124648
# Full pre-registered UK model: raw vocabulary size as outcome (no UK
# standardized norms), same fixed-effect structure as the NAE model, with
# random intercepts for lab and participant.
lmer.full.uk <- lmer(vocab_nwords ~ CDI.z_age_months + gender +
z_age_months + method + IDS_pref +
IDS_pref:method + IDS_pref:CDI.z_age_months + IDS_pref:z_age_months +
(1 | labid) + (1 | subid_unique),
data = data.uk
)
summary(lmer.full.uk)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: vocab_nwords ~ CDI.z_age_months + gender + z_age_months + method +
## IDS_pref + IDS_pref:method + IDS_pref:CDI.z_age_months +
## IDS_pref:z_age_months + (1 | labid) + (1 | subid_unique)
## Data: data.uk
##
## REML criterion at convergence: 1326
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -1.78175 -0.38449 0.01193 0.37856 1.94702
##
## Random effects:
## Groups Name Variance Std.Dev.
## subid_unique (Intercept) 5412.10 73.567
## labid (Intercept) 33.24 5.765
## Residual 2637.21 51.354
## Number of obs: 119, groups: subid_unique, 88; labid, 4
##
## Fixed effects:
## Estimate Std. Error df t value Pr(>|t|)
## (Intercept) 175.945 11.558 3.774 15.223 0.000158 ***
## CDI.z_age_months 28.905 1.964 44.974 14.714 < 2e-16 ***
## gender1 19.043 9.820 78.517 1.939 0.056074 .
## z_age_months -5.065 3.455 70.444 -1.466 0.147087
## method1 -11.249 13.255 5.502 -0.849 0.431430
## IDS_pref 10.003 26.512 83.053 0.377 0.706906
## method1:IDS_pref -7.789 29.223 87.125 -0.267 0.790455
## CDI.z_age_months:IDS_pref 1.573 5.726 50.812 0.275 0.784676
## z_age_months:IDS_pref 1.154 8.805 89.735 0.131 0.896025
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) CDI.z__ gendr1 z_g_mn methd1 IDS_pr m1:IDS CDI.__:
## CDI.z_g_mnt 0.127
## gender1 0.043 -0.069
## z_age_mnths 0.013 -0.099 -0.019
## method1 -0.383 -0.074 -0.090 0.490
## IDS_pref -0.362 -0.076 -0.118 0.023 0.197
## mthd1:IDS_p 0.186 0.094 0.007 -0.117 -0.303 -0.193
## CDI.__:IDS_ -0.100 -0.328 0.056 0.064 0.094 0.096 -0.136
## z_g_mn:IDS_ -0.033 0.079 -0.244 -0.374 -0.099 -0.087 0.431 -0.228
# Type III ANOVA table (via lmerTest) for the full UK model.
full.uk_pvalue <- anova(lmer.full.uk) %>%
as_tibble(rownames = "Parameter") # this gives us the Type III p values
# ==========
# Sequentially removed random effects:
# IDS_pref:z_age_months
# IDS_pref:CDI.z_age_months
# IDS_pref:method
# IDS_pref
# method
# z_age_months
# gender
# CDI.z_age_months
# First, check linearity. The plot looks linear
data.uk$resid <- residuals(lmer.full.uk)
plot(data.uk$resid, data.uk$vocab_nwords)
# Second, check normality
plot_model(lmer.full.uk, type = "diag") ## we do have right-skewed normality of residuals
## [[1]]
## `geom_smooth()` using formula 'y ~ x'
##
## [[2]]
## [[2]]$subid_unique
## `geom_smooth()` using formula 'y ~ x'
##
## [[2]]$labid
## `geom_smooth()` using formula 'y ~ x'
##
##
## [[3]]
##
## [[4]]
## `geom_smooth()` using formula 'y ~ x'
# Third, check autocorrelation
# Refit with nlme::lme (lab random intercept only) so nlme::ACF can be used.
# BUG FIX: the original call fitted this UK check on `data = data.nae`
# (copy-paste from the NAE section above); it must use `data.uk`.
re_run_lme.full.uk <- nlme::lme(vocab_nwords ~ CDI.z_age_months + gender +
  z_age_months + method + IDS_pref +
  IDS_pref:method + IDS_pref:CDI.z_age_months + IDS_pref:z_age_months,
random = ~ 1 | labid,
method = "REML",
data = data.uk, na.action = na.exclude
)
# NOTE(review): the "no autocorrelation" conclusion was originally drawn
# from the mis-specified fit; re-inspect this ACF plot after the fix.
plot(nlme::ACF(re_run_lme.full.uk, resType = "normalized")) # there is no sign for autocorrelation
# Lastly, check multi-collinearity (generalized VIFs for the UK model)
car::vif(lmer.full.uk) # no problem for multicollinearlity
## CDI.z_age_months gender z_age_months
## 1.142786 1.125370 1.678295
## method IDS_pref method:IDS_pref
## 1.592431 1.095868 1.457528
## CDI.z_age_months:IDS_pref z_age_months:IDS_pref
## 1.189684 1.730676
We now want to check the statistical power of significant effects,
and discard any models with significant effects that do not reach 80%
power. This however leads to too many warnings of singularity issues on
the model updates inherent to the simr power simulations,
hence we cannot obtain satisfactory power estimates as
pre-registered.
AST: Note that we don’t have any IV(s) that turned out to be significant in the Full NAE model. So we won’t run the power analysis check. For the UK full model, there are two statistically significant IV: CDI_age and gender. The post hoc power check suggested that we have high power in detecting the effect of CDI_age but not gender. Note that gender has a smaller effect size to begin with, so this may partially explain why we have less power in detecting it in the model. As there can be a number of different factors that determines the posthoc power, we decided not to remove gender in the model based on posthoc power analysis check.
# Post hoc power simulations (simr) for the two significant fixed effects in
# the UK model; 1000 simulations each, fixed seeds for reproducibility.
check_pwr_uk_cdi_age <- simr::powerSim(lmer.full.uk, test = fixed("CDI.z_age_months", method = "z"), seed = 2, nsim = 1000, alpha = 0.05) # power for the CDI.z_age_months fixed effect (original comment wrongly said "Gender")
check_pwr_uk_cdi_age
check_pwr_uk_gender <- simr::powerSim(lmer.full.uk, test = fixed("gender", method = "z"), seed = 3, nsim = 1000, alpha = 0.05) # power for the gender fixed effect
check_pwr_uk_gender
For this combined analysis, we first need to restrain the age range for the NAE sample (previously ±2 months, now ±1.5 months). [Note: This should be +/- 0.5 months…]
# Create dataset with British and NAE only
# For the combined analysis, the NAE age window is narrowed by 0.5 months on
# each side (round(.5 * 365.25 / 12) = 15 days) before re-filtering on
# CDI.agedays, to better match the UK sample's age range.
data.uk_nae <- data.total %>%
subset(language_zone %in% c("British", "NAE")) %>%
mutate(
CDI.agemin = ifelse(language_zone == "NAE",
CDI.agemin + round(.5 * 365.25 / 12),
CDI.agemin
),
CDI.agemax = ifelse(language_zone == "NAE",
CDI.agemax - round(.5 * 365.25 / 12),
CDI.agemax
)
) %>%
subset(!(CDI.agedays < CDI.agemin | CDI.agedays > CDI.agemax)) %>%
droplevels()
# Create contrasts for analysis
# (droplevels() strips contrasts, so they are re-set here; language_zone has
# two levels: British and NAE.)
contrasts(data.uk_nae$gender) <- contr.sum(2)
contrasts(data.uk_nae$method) <- contr.sum(3)
contrasts(data.uk_nae$language_zone) <- contr.sum(2)
We can then run the planned combined analysis adding the main effect
and interactions of language_zone.
# Combined UK + NAE model: proportional CDI score as outcome (comparable
# across zones), language_zone main effect and its interaction with IDS
# preference added, and a random slope of CDI age by lab.
lmer.full.uk_nae <- lmer(CDI.prop ~ CDI.z_age_months + language_zone + gender +
z_age_months + method + IDS_pref + IDS_pref:language_zone +
IDS_pref:method + IDS_pref:CDI.z_age_months + IDS_pref:z_age_months +
(1 + CDI.z_age_months | labid) + (1 | subid_unique),
data = data.uk_nae
)
summary(lmer.full.uk_nae)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: CDI.prop ~ CDI.z_age_months + language_zone + gender + z_age_months +
## method + IDS_pref + IDS_pref:language_zone + IDS_pref:method +
## IDS_pref:CDI.z_age_months + IDS_pref:z_age_months + (1 +
## CDI.z_age_months | labid) + (1 | subid_unique)
## Data: data.uk_nae
##
## REML criterion at convergence: -56
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -1.86767 -0.46070 -0.05384 0.41409 2.63524
##
## Random effects:
## Groups Name Variance Std.Dev. Corr
## subid_unique (Intercept) 0.0283551 0.16839
## labid (Intercept) 0.0005729 0.02394
## CDI.z_age_months 0.0010798 0.03286 0.45
## Residual 0.0173293 0.13164
## Number of obs: 401, groups: subid_unique, 286; labid, 12
##
## Fixed effects:
## Estimate Std. Error df t value Pr(>|t|)
## (Intercept) 0.322978 0.018753 8.939085 17.223 3.66e-08 ***
## CDI.z_age_months 0.043288 0.010038 11.995084 4.312 0.00101 **
## language_zone1 0.088047 0.022622 8.452255 3.892 0.00413 **
## gender1 0.035741 0.012381 252.339085 2.887 0.00423 **
## z_age_months -0.002581 0.004453 168.109225 -0.580 0.56289
## method1 -0.007231 0.025155 16.577207 -0.287 0.77732
## method2 0.008879 0.036872 12.524954 0.241 0.81360
## IDS_pref 0.031786 0.042931 276.643855 0.740 0.45970
## language_zone1:IDS_pref 0.033524 0.057322 295.425539 0.585 0.55910
## method1:IDS_pref -0.038658 0.055555 298.380191 -0.696 0.48706
## method2:IDS_pref -0.016639 0.088523 285.347212 -0.188 0.85104
## CDI.z_age_months:IDS_pref 0.006813 0.008220 158.076039 0.829 0.40841
## z_age_months:IDS_pref -0.001588 0.012432 265.483177 -0.128 0.89847
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation matrix not shown by default, as p = 13 > 12.
## Use print(x, correlation=TRUE) or
## vcov(x) if you need it
# Type III ANOVA table (via lmerTest) for the combined model.
combined.full.uk_nae_pvalue <- anova(lmer.full.uk_nae) %>%
as_tibble(rownames = "Parameter") # this gives us the Type III p values
# ==========
# Sequentially removed random effects:
# IDS_pref:z_age_months
# IDS_pref:CDI.z_age_months
# IDS_pref:method
# IDS_pref:language_zone
# IDS_pref
# method
# z_age_months
# gender
# language_zone
# ==========
# First, check linearity. The plot looks linear
data.uk_nae$resid <- residuals(lmer.full.uk_nae)
plot(data.uk_nae$resid, data.uk_nae$CDI.prop)
# Second, check normality
plot_model(lmer.full.uk_nae, type = "diag") ## we do have right-skewed normality of residuals
## [[1]]
## `geom_smooth()` using formula 'y ~ x'
##
## [[2]]
## [[2]]$subid_unique
## `geom_smooth()` using formula 'y ~ x'
##
## [[2]]$labid
## `geom_smooth()` using formula 'y ~ x'
##
##
## [[3]]
##
## [[4]]
## `geom_smooth()` using formula 'y ~ x'
# Third, check autocorrelation
# Refit with nlme::lme (CDI age random slope by lab, matching the lmer
# random structure as closely as lme allows) because nlme::ACF needs an
# lme object.
re_run_lme.full.uk_nae <- nlme::lme(CDI.prop ~ CDI.z_age_months + language_zone + gender +
z_age_months + method + IDS_pref + IDS_pref:language_zone +
IDS_pref:method + IDS_pref:CDI.z_age_months + IDS_pref:z_age_months,
random = ~ CDI.z_age_months | labid,
method = "REML",
data = data.uk_nae, na.action = na.exclude
)
plot(nlme::ACF(re_run_lme.full.uk_nae, resType = "normalized")) # there is no sign for autocorrelation
# Lastly, check multi-collinearity
car::vif(lmer.full.uk_nae) # no problem for multicollinearlity
## GVIF Df GVIF^(1/(2*Df))
## CDI.z_age_months 1.015449 1 1.007695
## language_zone 2.209327 1 1.486380
## gender 1.035132 1 1.017414
## z_age_months 1.488905 1 1.220207
## method 3.162695 2 1.333565
## IDS_pref 1.419845 1 1.191572
## language_zone:IDS_pref 2.765706 1 1.663041
## method:IDS_pref 3.755363 2 1.392076
## CDI.z_age_months:IDS_pref 1.048852 1 1.024135
## z_age_months:IDS_pref 1.496611 1 1.223361
We then compute \(p\)-values, but leave out power estimates for those \(p\)-values as above. Again, we have a lot of singular fit issues for the power checks and decided not to remove parameters based on posthoc power analysis.
# Post hoc power simulations (simr) for the significant fixed effects in the
# combined model; 1000 simulations each, fixed seeds for reproducibility.
check_pwr_combined_cdi_age <- simr::powerSim(lmer.full.uk_nae, test = fixed("CDI.z_age_months", method = "z"), seed = 2, nsim = 1000, alpha = 0.05) # power for the CDI.z_age_months fixed effect (original comment wrongly said "Gender")
check_pwr_combined_cdi_age
check_pwr_combined_lang_zone <- simr::powerSim(lmer.full.uk_nae, test = fixed("language_zone", method = "z"), seed = 3, nsim = 1000, alpha = 0.05) # power for the language_zone fixed effect (original comment wrongly said "Gender")
check_pwr_combined_lang_zone
check_pwr_combined_gender <- simr::powerSim(lmer.full.uk_nae, test = fixed("gender", method = "z"), seed = 3, nsim = 1000, alpha = 0.05) # power for the gender fixed effect
check_pwr_combined_gender